## libraries
# library(confintr)
# library(rstan)
# library(rstanarm)
library(glue)
library(fs)
# library(vroom)
library(broom)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stringr)
library(scales)
library(ggplot2)
# library(patchwork)
# library(reactable)
library(workflows)
library(workflowsets)
library(rsample)
library(themis)
## Loading required package: recipes
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
# library(recipes)
library(parsnip)
library(finetune)
## Loading required package: tune
# library(tune)
library(dials)
library(yardstick)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:scales':
##
## discard
library(tidyr)
library(dplyr)
## session options: avoid scientific notation, allow 8 cores for parallel work
options(scipen = 99L, mc.cores = 8L)
# rstan_options(auto_write = TRUE)
theme_set(theme_minimal())
## reproducibility seed
seed <- 1234L
set.seed(seed)
## column holding the outcome/target indicator
outcome_col <- "SURGERY"
## column uniquely identifying a member
uid_col <- "MEMBER_ID"
## two-level indicator encoding: first = negative, second = positive
ind_lvls <- c("no", "yes")
## the positive event is the second factor level ("yes")
event_lvl <- "second"
event_lbl <- ind_lvls[2L]
## join specification shared by member-level merges
join_mbr <- join_by(MEMBER_ID)
## color palette
# cpal <- ggsci::pal_igv(palette = "default", alpha = 1)(51L)
## metric set for binary-truth vs binary-estimate comparisons
bin_bin_metrics <- metric_set(mcc, f_meas, j_index, bal_accuracy, kap)
## walk the data directory, importing each file into the global environment as
## a `df_*` data frame named after the file (lower-cased, ".txt" stripped)
dir_data() |>
  fs::dir_ls() |>
  purrr::walk(\(x) {
    ## env var name to assign data frame to
    env_obj_nm <- fs::path_file(x) |> str_remove_all(stringr::fixed(".txt"))
    env_obj_nm <- paste("df", tolower(env_obj_nm), sep = "_")
    ## import data (space-delimited)
    df <- vroom::vroom(x, delim = " ", show_col_types = FALSE)
    ## check for parsing warnings/issues
    if (nrow(vroom::problems(df)) > 0L) {
      ## fix: `x` must itself be embraced inside the `{.file }` inline markup
      ## for cli to interpolate the path (plain `x` rendered literally)
      cli::cli_warn("parsing issues found with {.file {x}}, inspect manually")
    } else {
      ## glimpse data structure for non-test data
      if (!endsWith(env_obj_nm, "test")) {
        cli::cli_h3("{env_obj_nm}")
        glimpse(df, 80L)
      }
      ## `envir` is the documented argument for targeting an environment
      assign(env_obj_nm, df, envir = .GlobalEnv)
    }
  })
##
## ── df_claims_train
## Rows: 91,786
## Columns: 3
## $ MEMBER_ID <chr> "V2045033839", "U3368542783", "V7794221118", "L9612031535", …
## $ CLAIM_ID <chr> "DWD-1644TI185", "JAV-0750KB107", "KNZ-3862GG349", "XPI-7191…
## $ LOS_CODE <chr> "02", "11", "11", "22", "11", "11", "20", "20", "02", "20", …
## Multiple files in zip: reading '[Content_Types].xml'
##
## ── df_dxccsr-reference-file-v2023-1.xlsx
## Rows: 5
## Columns: 4
## $ `<?xml` <chr> "<Types", "ContentType=\"application/vnd.openxmlf…
## $ `version="1.0"` <chr> "xmlns=\"http://schemas.openxmlformats.org/packag…
## $ `encoding="UTF-8"` <chr> "Extension=\"bin\"", "ContentType=\"application/x…
## $ `standalone="yes"?>` <chr> "ContentType=\"application/vnd.openxmlformats-off…
##
## ── df_location_of_service
## Rows: 7
## Columns: 2
## $ LOS_DESCR <chr> "Emergency Room", "Outpatient", "Inpatient", "Professional",…
## $ LOS_CODE <chr> "23", "22", "21", "11", "20", "34", "02"
##
## ── df_surgeries_test_scored.csv
## Rows: 10,491
## Columns: 1
## $ `MEMBER_ID,PREDICTION,predicted_class` <chr> "C4766121375,1.5577000500949957…
##
## ── df_surgeries_train
## Rows: 41,965
## Columns: 9
## $ SURGERY <chr> "no", "no", "no", "no", "no", "no", "no", "no", "n…
## $ MEMBER_ID <chr> "W2648842706", "H4887816845", "N3902548962", "P318…
## $ ICD10 <chr> "M25611", "Z6854", "M79604", "M71122", "M25549", "…
## $ WEIGHTED_RISK_SCORE <dbl> 5.477114, 4.583526, 5.387169, 5.368089, 4.177909, …
## $ DIABETES <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ OPIOID_RX <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ CANCER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ AGE <dbl> 25.07722, 33.45827, 37.78651, 18.19988, 49.48217, …
## $ GENDER <chr> "F", "F", "M", "F", "F", "M", "M", "M", "M", "F", …
## calculate outcome proportion
df_outcome_summ <- df_surgeries_train |>
  summarise(n = n(), .by = all_of(outcome_col)) |>
  mutate(prop = n / sum(n))
## positive event/no info rate
## use the declared outcome column + positive label rather than hard-coding
outcome_pos_prop <- df_outcome_summ |>
  filter(.data[[outcome_col]] == event_lbl) |>
  pull(prop)
# glue("Positive Event Rate: {percent(outcome_pos_prop, .01)}")
## the no-information rate is the majority-class proportion
no_info_rt <- outcome_pos_prop
if (no_info_rt < .5) no_info_rt <- 1 - no_info_rt
df_outcome_summ |>
  # rename_with()
  reactable::reactable(columns = list(
    SURGERY = reactable::colDef("Surgery?"),
    ## TRUE instead of the reassignable shorthand T
    n = reactable::colDef("Count", format = reactable::colFormat(separators = TRUE)),
    prop = reactable::colDef("Proportion", format = reactable::colFormat(percent = TRUE, digits = 2L))
  ))
## per-Dx-code row counts split by outcome, with cumulative coverage columns
## (cume_prop = share of all rows, cume_pos_prop = share of positive events)
df_icd_summ_base <- df_surgeries_train |>
summarise(
value = n(),
.by = c(SURGERY, ICD10)
) |>
pivot_wider(names_from = SURGERY, values_fill = 0L) |>
rowwise() |>
## row total = no + yes counts for the code
mutate(total = sum(c_across(!ICD10))) |>
ungroup() |>
arrange(total, yes) |>
mutate(
prop = total / sum(total),
cume_prop = cumsum(total) / sum(total),
cume_pos_prop = cumsum(yes) / sum(yes)
) |>
arrange(desc(cume_prop))
## cumulative coverage curve; the vline marks the 89 most common codes
df_icd_summ_base |>
mutate(idx = row_number()) |>
select(idx, starts_with("cume_")) |>
pivot_longer(cols = !idx) |>
ggplot(aes(idx, value, col = name)) +
geom_line() +
geom_vline(xintercept = 89, col = "coral2", lty = 2L) +
scale_y_continuous(labels = label_percent()) +
scale_x_continuous(labels = label_comma()) +
labs(title = "Cumulative % of Dx", x = "Dx Count", y = NULL)
## the 89 most common codes
df_icd_summ_base |> slice_head(n = 89L)
## # A tibble: 89 × 7
## ICD10 no yes total prop cume_prop cume_pos_prop
## <chr> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 M79642 249 8 257 0.00612 1 1
## 2 M79632 250 4 254 0.00605 0.994 0.991
## 3 M25621 249 4 253 0.00603 0.988 0.986
## 4 M25529 238 9 247 0.00589 0.982 0.982
## 5 M545 241 4 245 0.00584 0.976 0.972
## 6 M79676 237 6 243 0.00579 0.970 0.967
## 7 M79621 238 4 242 0.00577 0.964 0.960
## 8 M25662 240 2 242 0.00577 0.959 0.956
## 9 M25551 231 10 241 0.00574 0.953 0.954
## 10 M25531 235 5 240 0.00572 0.947 0.942
## # ℹ 79 more rows
# df_icd_summ_base |> slice_tail(n = -89L)
## Dx codes by number of characters
df_surgeries_train |>
mutate(len = nchar(ICD10)) |>
summarise(
n = n(),
n_distinct = n_distinct(ICD10),
.by = len
)
## # A tibble: 4 × 3
## len n n_distinct
## <int> <int> <int>
## 1 6 23852 367
## 2 5 10624 387
## 3 4 7215 270
## 4 3 274 12
## Dx code truncation: summarise prevalence using the first 5 characters
df_icd_5_summ_base <- df_surgeries_train |>
  mutate(ICD = str_sub(ICD10, 1L, 5L)) |>
  summarise_dx()
## cumulative coverage of the 5-character prefixes; reference lines mark the
## top 22 prefixes and the 60% coverage level
df_icd_5_summ_base |>
  mutate(rank_idx = row_number()) |>
  select(rank_idx, starts_with("cume_")) |>
  pivot_longer(cols = !rank_idx) |>
  ggplot(aes(rank_idx, value, color = name)) +
  geom_line() +
  geom_vline(xintercept = 22, color = "coral2", linetype = 2L) +
  geom_hline(yintercept = .6, color = "coral2", linetype = 2L) +
  coord_cartesian(xlim = c(0L, 90L)) +
  scale_y_continuous(labels = label_percent()) +
  scale_x_continuous(labels = label_comma()) +
  labs(title = "Cumulative % of Dx", x = "Dx Count", y = NULL)
## prevalence summary for the first-4-character Dx prefixes
## (note: despite the original section title, no 5-character codes are
## excluded here -- every row contributes its 4-character prefix)
df_icd_4_summ_base <- df_surgeries_train |>
  mutate(ICD = str_sub(ICD10, 1L, 4L)) |>
  summarise_dx()
## cumulative coverage of the 4-character prefixes
df_icd_4_summ_base |>
  mutate(rank_idx = row_number()) |>
  select(rank_idx, starts_with("cume_")) |>
  pivot_longer(cols = !rank_idx) |>
  ggplot(aes(rank_idx, value, color = name)) +
  geom_line() +
  geom_vline(xintercept = 4, color = "coral2", linetype = 2L) +
  geom_hline(yintercept = .6, color = "coral2", linetype = 2L) +
  coord_cartesian(xlim = c(0L, 45L)) +
  scale_y_continuous(labels = label_percent()) +
  scale_x_continuous(labels = label_comma()) +
  labs(title = "Cumulative % of Dx", x = "Dx Count", y = NULL)
## the four most common 4-character prefixes
icd_4_v <- df_icd_4_summ_base |>
  slice_head(n = 4L) |>
  pull(ICD)
## common first 3 characters summary (excluding common 4 & 5 character codes)
## rows whose 4-character prefix is in icd_4_v are dropped (ICD set to NA)
df_icd_3_summ_base <- df_surgeries_train |>
mutate(
ICD_4 = str_sub(ICD10, 1L, 4L),
ICD_4_IND = case_match(ICD_4, icd_4_v ~ "yes", .default = "no"),
ICD = case_match(ICD_4_IND, "no" ~ str_sub(ICD10, 1L, 3L), .default = NA_character_)
) |>
filter(!is.na(ICD)) |>
summarise_dx()
## plot common 3 character codes
df_icd_3_summ_base |>
mutate(idx = row_number()) |>
select(idx, starts_with("cume_")) |>
pivot_longer(cols = !idx) |>
ggplot(aes(idx, value, col = name)) +
geom_line() +
coord_cartesian(xlim = c(0L, 45L)) +
scale_y_continuous(labels = label_percent()) +
scale_x_continuous(labels = label_comma()) +
labs(title = "Cumulative % of Dx", x = "Dx Count", y = NULL)
## most common 3 character codes
icd_3_v <- df_icd_3_summ_base |> slice_head(n = 7L) |> pull(ICD)
## common first 2 characters summary (excluding common 3, 4, & 5 character codes)
df_icd_2_summ_base <- df_surgeries_train |>
mutate(
ICD_4 = str_sub(ICD10, 1L, 4L),
ICD_4_IND = case_match(ICD_4, icd_4_v ~ TRUE, .default = FALSE),
ICD_3 = str_sub(ICD10, 1L, 3L),
ICD_3_IND = case_match(ICD_3, icd_3_v ~ TRUE, .default = FALSE),
## "yes" when the row is already covered by a common 3/4-character prefix
ICD_TRIM_IND = case_when(!ICD_3_IND & !ICD_4_IND ~ "no", .default = "yes"),
ICD = case_match(ICD_TRIM_IND, "no" ~ str_sub(ICD10, 1L, 2L), .default = NA_character_)
) |>
filter(!is.na(ICD)) |>
summarise_dx()
## plot common 2 character codes
df_icd_2_summ_base |>
mutate(idx = row_number()) |>
select(idx, starts_with("cume_")) |>
pivot_longer(cols = !idx) |>
ggplot(aes(idx, value, col = name)) +
geom_line() +
scale_y_continuous(labels = label_percent()) +
scale_x_continuous(labels = label_comma()) +
labs(title = "Cumulative % of Dx", x = "Dx Count", y = NULL)
## most common 2 character codes
icd_2_v <- df_icd_2_summ_base |> slice_head(n = 12L) |> pull(ICD)
## add common Dx code indicators
## one yes/no column per selected prefix, named after the prefix itself
df_icd_trim <- df_surgeries_train |> select(all_of(c(uid_col, outcome_col)), ICD10)
for (i in sort(c(icd_4_v, icd_3_v, icd_2_v))) {
df_icd_trim <- df_icd_trim |> mutate(
"{i}" := case_match(str_sub(ICD10, 1L, nchar(i)), i ~ "yes", .default = "no")
)
}
df_icd_trim <- df_icd_trim |> select(!ICD10)
## calculate outcome association
## For the indicator column `x` of df_icd_trim, compute association of the
## yes/no Dx-prefix indicator with the surgery outcome: Cramer's V (with CI),
## log odds ratio (with CI), and the binary classification metric set.
## Returns a one-row tibble with one column per statistic.
icd_metrics <- function(x) {
  dat <- df_icd_trim |>
    rename_with(\(y) "estimate", all_of(x)) |>
    select(all_of(c(uid_col, outcome_col)), estimate) |>
    distinct() |>
    ## `v` (not `x`) so the lambda does not shadow the function argument
    mutate(across(c(all_of(outcome_col), estimate), \(v) factor(v, ind_lvls)))
  cm <- conf_mat(dat, truth = SURGERY, estimate = estimate)
  cramers_v <- confintr::ci_cramersv(cm$table)
  cramers_v <- c(cramers_v$estimate, cramers_v$interval) |>
    vctrs::vec_set_names(c("cramers_v", "cramers_v_lb", "cramers_v_ub"))
  odds_ratio <- confintr::ci_oddsratio(cm$table)
  ## log scale so odds ratios are comparable/symmetric around zero
  odds_ratio <- log(c(odds_ratio$estimate, odds_ratio$interval)) |>
    vctrs::vec_set_names(c("odds_ratio", "odds_ratio_lb", "odds_ratio_ub"))
  out_1 <- bind_cols(t(cramers_v), t(odds_ratio))
  out_2 <- dat |>
    bin_bin_metrics(truth = SURGERY, estimate = estimate, event_level = event_lvl) |>
    select(!.estimator) |>
    pivot_wider(names_from = .metric, values_from = .estimate)
  bind_cols(out_1, out_2)
}
## score every candidate prefix indicator against the outcome
df_icd_metrics_res <- tibble(ICD = c(icd_4_v, icd_3_v, icd_2_v)) |>
mutate(data = map(ICD, icd_metrics)) |>
unnest(data)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `data = map(ICD, icd_metrics)`.
## Caused by warning in `ci_chisq_ncp()`:
## ! Upper limit outside search range. Set to the maximum of the parameter range.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
## strongest associations first
df_icd_metrics_res |> arrange(desc(cramers_v))
## # A tibble: 23 × 12
## ICD cramers_v cramers_v_lb cramers_v_ub odds_ratio odds_ratio_lb
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M54 0.00878 0 0.0190 -0.417 -0.933
## 2 M26 0.00627 0 0.0166 0.201 -0.130
## 3 Z6 0.00558 0 0.0159 -0.229 -0.664
## 4 M256 0.00552 0 0.0159 -0.119 -0.334
## 5 M6 0.00505 0 0.0154 0.237 -0.267
## 6 R2 0.00447 0 0.0149 0.209 -0.294
## 7 K7 0.00428 0 0.0147 -0.247 -0.886
## 8 F11 0.00417 0 0.0146 0.179 -0.277
## 9 F1 0.00395 0 0.0144 0.113 -0.179
## 10 M70 0.00331 0 0.0137 -0.127 -0.530
## # ℹ 13 more rows
## # ℹ 6 more variables: odds_ratio_ub <dbl>, mcc <dbl>, f_meas <dbl>,
## # j_index <dbl>, bal_accuracy <dbl>, kap <dbl>
## log odds ratio (with CI) for each Dx-prefix indicator
df_icd_metrics_res |>
  select(ICD, starts_with("odds_ratio")) |>
  mutate(
    ICD = factor(ICD, ordered = TRUE),
    ICD = reorder(ICD, desc(abs(odds_ratio)))
  ) |>
  ggplot(aes(ICD)) +
  geom_point(aes(y = odds_ratio)) +
  geom_errorbar(aes(ymin = odds_ratio_lb, ymax = odds_ratio_ub)) +
  scale_y_continuous(labels = label_percent()) +
  # scale_x_continuous(labels = label_comma()) +
  theme(axis.text.x = element_text(angle = 90L, hjust = 1L, vjust = .5)) +
  ## fix: the `+` above was missing, so labs() was evaluated on its own and
  ## printed as a bare labels list instead of being added to the plot;
  ## the x axis shows Dx codes, not counts, so the label is corrected too
  labs(title = "Log Odds Ratio", x = "Dx Code", y = NULL)
## select top Dx codes by absolute log odds ratio
icds_v <- df_icd_metrics_res |> slice_max(abs(odds_ratio), n = 10L) |> pull(ICD)
df_icd_trim <- df_icd_trim |> select(all_of(c(uid_col, sort(icds_v))))
Find the most predictive Dx codes based on outcome association.
## member/outcome plus the 3-character Dx prefix as the candidate indicator
df_icd_3_base <- df_surgeries_train |>
  mutate(
    estimate = str_sub(ICD10, 1L, 3L),
    across(all_of(outcome_col), \(x) factor(x, ind_lvls))
  ) |>
  select(all_of(c(uid_col, outcome_col)), estimate)
## Score one 3-character Dx prefix `x` against the surgery outcome:
## Cramer's V (with CI), log odds ratio (with CI), and the binary
## classification metric set, returned as a one-row tibble.
icd_3_metrics <- function(x) {
  ## flag members whose 3-character Dx prefix matches `x`
  scored <- df_icd_3_base |>
    mutate(estimate = factor(
      case_match(estimate, x ~ "yes", .default = "no"),
      ind_lvls
    ))
  xtab <- conf_mat(scored, truth = SURGERY, estimate = estimate)
  ## Cramer's V point estimate + confidence interval
  cv <- confintr::ci_cramersv(xtab$table)
  cv <- vctrs::vec_set_names(
    c(cv$estimate, cv$interval),
    c("cramers_v", "cramers_v_lb", "cramers_v_ub")
  )
  ## log odds ratio point estimate + confidence interval
  lor <- confintr::ci_oddsratio(xtab$table)
  lor <- vctrs::vec_set_names(
    log(c(lor$estimate, lor$interval)),
    c("odds_ratio", "odds_ratio_lb", "odds_ratio_ub")
  )
  assoc <- bind_cols(t(cv), t(lor))
  ## one wide row of classification metrics
  cls <- scored |>
    bin_bin_metrics(truth = SURGERY, estimate = estimate, event_level = event_lvl) |>
    select(!.estimator) |>
    pivot_wider(names_from = .metric, values_from = .estimate)
  bind_cols(assoc, cls)
}
## summarise each 3-character prefix's share of rows, keep prefixes covering
## more than 1% of the data, then score each against the outcome
df_icd_3_metric_res <- df_icd_3_base |>
  summarise(n = n(), .by = estimate) |>
  mutate(prop = n / sum(n)) |>
  ## use the prop column just computed instead of re-deriving n / sum(n)
  filter(prop > .01) |>
  select(icd_3 = estimate) |>
  mutate(data = map(icd_3, icd_3_metrics)) |>
  unnest(data) |>
  arrange(desc(abs(odds_ratio)))
## top prefixes by absolute log odds ratio
icd_3_top_v <- df_icd_3_metric_res |>
  slice_head(n = 11L) |>
  pull(icd_3)
## quick distribution check of the raw training columns
summary(df_surgeries_train)
## SURGERY MEMBER_ID ICD10 WEIGHTED_RISK_SCORE
## Length:41965 Length:41965 Length:41965 Min. : 1.112
## Class :character Class :character Class :character 1st Qu.: 4.548
## Mode :character Mode :character Mode :character Median : 5.229
## Mean : 5.244
## 3rd Qu.: 5.932
## Max. :10.810
## DIABETES OPIOID_RX CANCER AGE
## Min. :0.0000 Min. :0.00000 Min. :0.000000 Min. :13.28
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:26.70
## Median :0.0000 Median :0.00000 Median :0.000000 Median :35.03
## Mean :0.0921 Mean :0.05438 Mean :0.002359 Mean :35.14
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:43.21
## Max. :1.0000 Max. :1.00000 Max. :1.000000 Max. :59.21
## GENDER
## Length:41965
## Class :character
## Mode :character
##
##
##
## proportion of missing (NA) values per column -- all zero below
df_surgeries_train |>
summarise(across(everything(), \(x) mean(is.na(x)))) |>
pivot_longer(everything()) |>
arrange(desc(value), name)
## # A tibble: 9 × 2
## name value
## <chr> <dbl>
## 1 AGE 0
## 2 CANCER 0
## 3 DIABETES 0
## 4 GENDER 0
## 5 ICD10 0
## 6 MEMBER_ID 0
## 7 OPIOID_RX 0
## 8 SURGERY 0
## 9 WEIGHTED_RISK_SCORE 0
No missing values (NAs)?
## member-level claim counts by claim/location-of-service type
df_mbr_clm_cts <- process_clms(df_claims_train)
feat_clm_type_col <- setdiff(colnames(df_mbr_clm_cts), uid_col)
## feature columns: surgery-table columns (minus ids/outcome/raw Dx/gender),
## top Dx-prefix indicators, male indicator, and claim-type counts;
## fix: feat_clm_type_col was previously appended a second time here,
## duplicating those names in feat_col and the model formula
feat_col <- colnames(df_surgeries_train) |>
  setdiff(c(uid_col, outcome_col, "ICD10", "GENDER")) |>
  c(icd_3_top_v, "MALE_IND", feat_clm_type_col)
bin_col <- c("OPIOID_RX", "CANCER", "DIABETES", "MALE_IND", icd_3_top_v)
# multi_col <- c("ICD")
cont_col <- feat_col |> setdiff(bin_col)
feat_form <- formula(paste(outcome_col, paste(feat_col, collapse = " + "), sep = " ~ "))
## assemble the modeling frame: claim counts + Dx indicators + recodes
df_train_base <- df_surgeries_train |>
left_join(df_mbr_clm_cts, by = join_mbr) |>
left_join(process_icds(df_surgeries_train), by = join_mbr) |>
mutate(
MALE_IND = case_match(GENDER, "M" ~ "yes", .default = "no"),
CANCER = case_match(CANCER, 1L ~ "yes", .default = "no"),
DIABETES = case_match(DIABETES, 1L ~ "yes", .default = "no"),
OPIOID_RX = case_match(OPIOID_RX, 1L ~ "yes", .default = "no"),
## members without claims get zero counts rather than NA
across(all_of(feat_clm_type_col), \(x) coalesce(x, 0)),
across(all_of(c(outcome_col, bin_col)), \(x) factor(x, ind_lvls))
) |>
select(all_of(c(uid_col, outcome_col, feat_col)))
## check for nulls
df_train_base |>
summarise(across(all_of(feat_col), \(x) mean(is.na(x)))) |>
pivot_longer(everything()) |>
arrange(desc(value), name)
## # A tibble: 23 × 2
## name value
## <chr> <dbl>
## 1 AGE 0
## 2 CANCER 0
## 3 DIABETES 0
## 4 EMERGENCY_ROOM 0
## 5 F10 0
## 6 F11 0
## 7 G56 0
## 8 INPATIENT 0
## 9 K85 0
## 10 M25 0
## # ℹ 13 more rows
## class-conditional proportions for each binary feature
map(bin_col, \(x) {
df_train_base |>
rename_with(\(y) "var", all_of(x)) |>
summarise(n = n(), .by = c(var, SURGERY)) |>
mutate(prop = n / sum(n), .by = SURGERY) |>
ggplot(aes(SURGERY, prop, fill = var)) +
geom_col(position = "dodge", alpha = .7, show.legend = FALSE) +
facet_grid(rows = vars(var), scales = "free_y") +
scale_y_continuous(labels = label_percent()) +
labs(title = x, y = "Proportion by Surgery Indicator")
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
## class-conditional histograms for each continuous feature
map(cont_col, \(x) {
df_train_base |>
rename_with(\(y) "var", all_of(x)) |>
ggplot(aes(var, fill = SURGERY)) +
geom_histogram(bins = 30L, alpha = .7, show.legend = FALSE) +
facet_grid(rows = vars(SURGERY), scales = "free_y") +
scale_y_continuous(labels = label_comma(), trans = "log1p") +
labs(title = glue::glue("{x} by {outcome_col}"), x = NULL)
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
## continuous feature intercorrelation
df_train_base |>
  select(all_of(cont_col)) |>
  ## quiet = TRUE suppresses the method/missing-treatment message, matching
  ## the identical correlate() call later in this script
  corrr::correlate(quiet = TRUE) |>
  corrr::rearrange(method = "HC") |>
  corrr::autoplot()
## binary feature intercorrelation -- matthew's correlation coefficient
df_train_base |>
  select(all_of(c(outcome_col, bin_col))) |>
  corrr::colpair_map(.f = mcc_vec) |>
  corrr::rearrange(method = "HC") |>
  corrr::autoplot()
## rebuild the claim counts and feature lists after pruning weak features
df_mbr_clm_cts <- process_clms(df_claims_train)
feat_clm_type_col <- setdiff(colnames(df_mbr_clm_cts), uid_col)
## features excluded based on the exploration above
excl_col <- c(
"TELEHEALTH", "CANCER", "DIABETES", "GENDER", "URGENT_CARE", "AGE",
"PROFESSIONAL", "G56", "K85", "F11", "M25", "F10", "M71", "M60" ,"Z62", "M26"
)
feat_col <- colnames(df_surgeries_train) |>
setdiff(c(uid_col, outcome_col, "ICD10")) |>
# c(icds_v) |>
c(icd_3_top_v) |>
c(feat_clm_type_col) |>
setdiff(excl_col)
bin_col <- c("OPIOID_RX", icd_3_top_v) |> setdiff(excl_col)
# multi_col <- c("ICD")
cont_col <- feat_col |> setdiff(bin_col)
feat_form <- formula(paste(outcome_col, paste(feat_col, collapse = " + "), sep = " ~ "))
df_train_base <- process_predict(df_surgeries_train, df_claims_train)
## confirm no missing values in the final feature set
df_train_base |>
summarise(across(all_of(feat_col), \(x) mean(is.na(x)))) |>
pivot_longer(everything()) |>
arrange(desc(value), name)
## # A tibble: 7 × 2
## name value
## <chr> <dbl>
## 1 EMERGENCY_ROOM 0
## 2 INPATIENT 0
## 3 M54 0
## 4 M70 0
## 5 OPIOID_RX 0
## 6 OUTPATIENT 0
## 7 WEIGHTED_RISK_SCORE 0
## class-conditional proportions for the remaining binary features
map(bin_col, \(x) {
df_train_base |>
rename_with(\(y) "var", all_of(x)) |>
summarise(n = n(), .by = c(var, SURGERY)) |>
mutate(prop = n / sum(n), .by = SURGERY) |>
ggplot(aes(SURGERY, prop, fill = var)) +
geom_col(position = "dodge", alpha = .7, show.legend = FALSE) +
facet_grid(rows = vars(var), scales = "free_y") +
scale_y_continuous(labels = label_percent()) +
labs(title = x, y = "Proportion by Surgery Indicator")
})
## [[1]]
##
## [[2]]
##
## [[3]]
## class-conditional histograms for the remaining continuous features
map(cont_col, \(x) {
df_train_base |>
rename_with(\(y) "var", all_of(x)) |>
ggplot(aes(var, fill = SURGERY)) +
geom_histogram(bins = 30L, alpha = .7, show.legend = FALSE) +
facet_grid(rows = vars(SURGERY), scales = "free_y") +
scale_y_continuous(labels = label_comma()) +
labs(title = glue::glue("{x} by {outcome_col}"), x = NULL)
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
## continuous feature intercorrelation
df_train_base |>
select(all_of(cont_col)) |>
corrr::correlate(quiet = TRUE) |>
corrr::rearrange(method = "HC") |>
corrr::autoplot()
## binary feature intercorrelation -- matthew's correlation coefficient
df_train_base |>
select(all_of(c(outcome_col, bin_col))) |>
corrr::colpair_map(.f = mcc_vec) |>
corrr::rearrange(method = "HC") |>
corrr::autoplot()
Split data into train/validation and define resampling object
## training/validation split
## NOTE(review): initial_split() is not stratified here; with a rare positive
## class, strata = SURGERY would stabilize split proportions -- confirm
## before changing (downstream results would shift)
tv_split <- df_train_base |> rsample::initial_split(prop = .7)
## resamples object (cross-validation)
df_folds <- training(tv_split) |> vfold_cv(v = 4L, repeats = 4L)
## logistic regression recipe
log_recipe <- recipe(formula = feat_form, data = training(tv_split)) |>
## downsample the majority class to a 2:1 ratio
step_downsample(under_ratio = 2L) |>
## recode yes/no indicator factors to 0/1 integers
step_mutate_at(all_of(bin_col), fn = \(x) {
case_match(x, "yes" ~ 1L, .default = 0L)
}) |>
# step_dummy(OPIOID_RX) |>
# step_rename(OPIOID_RX = OPIOID_RX_yes) |>
# step_interact(terms = ~ OUTPATIENT:OPIOID_RX) |>
# step_rm(OPIOID_RX) |>
## log(x + 1) transform of continuous features, drop zero-variance columns,
## then center/scale numeric predictors
step_log(all_of(cont_col), offset = 1) |>
step_zv(all_predictors()) |>
step_normalize(all_numeric_predictors())
## logistic regression model spec
# library(multilevelmod)
## Student-t prior (df = 7, scale = 2.5) for coefficients and intercept
t_prior <- rstanarm::student_t(df = 7, location = 0, scale = 2.5)
log_spec <- logistic_reg(mode = "classification") |>
set_engine(
engine = "stan",
family = binomial(link = "logit"),
prior = t_prior,
prior_intercept = t_prior,
seed = seed,
cores = 8L
)
# log_spec <- logistic_reg(
# mode = "classification",
# engine = "glmnet",
# penalty = tune(),
# mixture = 0
# )
## logistic regression workflow
log_wf <- workflow() |>
add_recipe(log_recipe) |>
add_model(log_spec)
# ## tree parameter ranges
# log_param <- extract_parameter_set_dials(log_wf) |>
# update(
# penalty = penalty(c(-9, -3))
# # mixture = mixture(c(0, 1))
# )
# # tree parameter grid
# log_grid <- grid_max_entropy(log_param, size = 4L, variogram_range = 2 / 3)
## tree recipe (trees need no scaling/log transforms; only downsampling)
tree_recipe <- recipe(formula = feat_form, data = training(tv_split)) |>
step_downsample(under_ratio = 2L)
## tree model spec
tree_spec <- decision_tree(
cost_complexity = tune(),
tree_depth = tune(),
min_n = tune()
) |>
set_mode("classification") |>
set_engine(engine = "rpart")
## tree workflow
tree_wf <- workflow() |>
add_recipe(tree_recipe) |>
add_model(tree_spec)
## tree parameter ranges
tree_param <- extract_parameter_set_dials(tree_wf) |>
update(
cost_complexity = cost_complexity(c(-10, -2)),
tree_depth = tree_depth(c(4L, 8L)),
min_n = min_n(c(30L, 120L))
)
## tree parameter grid
tree_grid <- grid_max_entropy(tree_param, size = 9L, variogram_range = 2 / 3)
## random forest model spec
rf_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 300L) |>
set_mode("classification") |>
set_engine(
engine = "ranger",
max.depth = 10L, #tune("max.depth"),
# regularization.factor = tune("regularization_factor"),
# regularization.usedepth = TRUE,
num.threads = 8L
)
## random forest workflow (reuses the tree recipe)
rf_wf <- workflow() |> add_recipe(tree_recipe) |> add_model(rf_spec)
## random forest parameter ranges
rf_param <- extract_parameter_set_dials(rf_wf) |>
update(
mtry = mtry(c(2L, 4L)),
min_n = min_n(c(20L, 50L))
# trees = trees(c(300L, 600L))
# max.depth = tree_depth(c(8L, 12L))
# regularization_factor = regularization_factor(c(0, 1))
)
## random forest parameter grid
rf_grid <- grid_max_entropy(rf_param, size = 8L, variogram_range = 2 / 3)
## bundle all three workflows for joint tuning
model_wf_set <- as_workflow_set(
log = log_wf,
tree = tree_wf,
rf = rf_wf
)
## tune grid control object
## NOTE(review): control_resamples() is passed to tune_grid() here; it worked
## when this document was rendered, but control_grid() is the documented
## constructor -- confirm before changing
ctrl_tune_grid <- control_resamples(verbose = FALSE, event_level = event_lvl)
## simulated-annealing search control; save_workflow = TRUE (not the
## reassignable shorthand T) so workflows can be extracted afterwards
ctrl_tune_search <- control_sim_anneal(
  verbose = FALSE, no_improve = 8L, restart = 8L, event_level = event_lvl,
  save_workflow = TRUE
)
## evaluation metrics
eval_metrics <- metric_set(gain_capture, roc_auc, average_precision)
## random tune grid
set.seed(seed)
## grid tuning across all three workflows (log has no tunable parameters)
tune_grid_res <- model_wf_set |>
option_add(id = "tree", param_info = tree_param, grid = tree_grid) |>
option_add(id = "rf", param_info = rf_param, grid = rf_grid) |>
workflow_map(
fn = "tune_grid",
seed = seed,
verbose = FALSE,
resamples = df_folds,
metrics = eval_metrics,
control = ctrl_tune_grid
)
## tune grid results
autoplot(tune_grid_res, metric = "roc_auc")
## refine tree/rf parameters via simulated annealing, warm-started from the
## grid results (the logistic workflow has nothing to tune and is dropped)
tune_search_res <- tune_grid_res |>
option_remove(grid) |>
# model_wf_set |>
# filter(wflow_id == "tree") |>
filter(wflow_id != "log") |>
# option_add(id = "tree", initial = extract_workflow_set_result(tune_grid_res, id = "tree")) |>
# option_add(id = "rf", initial = extract_workflow_set_result(tune_grid_res, id = "rf")) |>
workflow_map(
fn = "tune_sim_anneal",
verbose = FALSE,
seed = seed,
# resamples = df_folds,
iter = 8L,
# metrics = eval_metrics,
control = ctrl_tune_search
)
## Warning: There are existing options that are being modified
## tree: 'control'
## rf: 'control'
## Optimizing gain_capture
## Initial best: 0.70065
## 1 ◯ accept suboptimal gain_capture=0.70065 (+/-0.01733)
## 2 ♥ new best gain_capture=0.70335 (+/-0.01457)
## 3 ─ discard suboptimal gain_capture=0.6742 (+/-0.008733)
## 4 ♥ new best gain_capture=0.80484 (+/-0.01746)
## 5 ─ discard suboptimal gain_capture=0.72608 (+/-0.01125)
## 6 ◯ accept suboptimal gain_capture=0.76546 (+/-0.01878)
## 7 + better suboptimal gain_capture=0.79854 (+/-0.0176)
## 8 ♥ new best gain_capture=0.81571 (+/-0.01542)
## Optimizing gain_capture
## Initial best: 0.96057
## 1 ♥ new best gain_capture=0.96284 (+/-0.001395)
## 2 ◯ accept suboptimal gain_capture=0.96188 (+/-0.001268)
## 3 + better suboptimal gain_capture=0.9624 (+/-0.001374)
## 4 ◯ accept suboptimal gain_capture=0.96233 (+/-0.001378)
## 5 + better suboptimal gain_capture=0.96274 (+/-0.001379)
## 6 ◯ accept suboptimal gain_capture=0.96051 (+/-0.001507)
## 7 + better suboptimal gain_capture=0.96245 (+/-0.001388)
## 8 ◯ accept suboptimal gain_capture=0.96102 (+/-0.001482)
# tune_search_res <- structure(
# tune_search_res |> bind_rows(tune_grid_res |> filter(wflow_id != "tree")),
# class = c("workflow_set", class(tune_search_res))
# )
## tune grid results
autoplot(tune_search_res, metric = c("roc_auc", "average_precision"))
# autoplot(tune_grid_res, id = "log", metric = "roc_auc")
autoplot(tune_search_res, id = "tree", metric = "roc_auc")
autoplot(tune_search_res, id = "rf", metric = "roc_auc")
## logistic
## stan
log_fit <- log_wf |> fit(data = training(tv_split))
## glm/glmnet
# log_fit_param <- tune_grid_res |>
# extract_workflow_set_result(id = "log") |>
# select_by_pct_loss(
# desc(penalty), mixture,
# metric = "gain_capture",
# limit = 2
# )
## decision tree
## pick the simplest tree within 2% of the best gain_capture
tree_fit_param <- tune_search_res |>
extract_workflow_set_result(id = "tree") |>
select_by_pct_loss(
tree_depth, desc(min_n), desc(cost_complexity),
metric = "gain_capture",
limit = 2
)
tree_fit <- tree_wf |>
finalize_workflow(tree_fit_param) |>
fit(data = training(tv_split))
## random forest
## pick the simplest forest within 2% of the best gain_capture
rf_fit_param <- tune_search_res |>
extract_workflow_set_result(id = "rf") |> #collect_metrics()
select_by_pct_loss(
mtry, desc(min_n), #trees,
metric = "gain_capture",
limit = 2
)
rf_fit <- rf_wf |>
finalize_workflow(rf_fit_param) |>
fit(data = training(tv_split))
## refit the forest with corrected-impurity importance for the VI plot below
rf_imp_fit <- rf_wf |>
update_model(
spec = rand_forest(mtry = tune(), min_n = tune(), trees = 300L) |>
set_mode("classification") |>
set_engine(
engine = "ranger",
importance = "impurity_corrected",
max.depth = 10L, #tune("max.depth"),
num.threads = 8L
)
)|>
finalize_workflow(rf_fit_param) |>
fit(data = training(tv_split))
## model fit list
fit_ls <- list(log = log_fit, tree = tree_fit, rf = rf_fit)
## underlying stanreg object for posterior plots
log_stan_fit <- log_fit |> extract_fit_engine()
bayesplot::color_scheme_set("viridis")
# log_stan_fit |> plot()
log_stan_fit |> plot(plotfun = "areas", prob = 0.9)
# log_stan_fit |> plot(plotfun = "combo")
## logistic
## importance = |posterior mean| as a share of the total across terms
df_log_imp <- broom.mixed::tidyMCMC(log_stan_fit) |>
filter(term != "(Intercept)") |>
mutate(
Variable = term,
Importance = 100 * abs(estimate) / sum(abs(estimate)),
.keep = "none"
)
## tree
df_tree_imp <- tree_fit |> extract_fit_parsnip() |> vip::vi(scale = TRUE)
## random forest
df_rf_imp <- rf_imp_fit |> extract_fit_parsnip() |> vip::vi(scale = TRUE)
## plot importance side by side per model
bind_rows(
df_log_imp |> mutate(wflow_id = "log"),
df_tree_imp |> mutate(wflow_id = "tree"),
df_rf_imp |> mutate(wflow_id = "rf")
) |>
order_var_fct() |>
ggplot(aes(Importance, Variable)) +
geom_col() +
facet_grid(cols = vars(wflow_id)) +
labs(title = "Feature Importance", y = NULL)
## render the fitted decision tree
extract_fit_engine(tree_fit) |>
rpart.plot::prp(
type = 2, varlen = 0, faclen = 0, extra = 106, roundint = FALSE, digits = 3
)
## stack train + validation rows, tagging each row with its data set
df_to_score <- training(tv_split) |>
  mutate(data_set = "train") |>
  bind_rows(testing(tv_split)) |>
  mutate(
    data_set = coalesce(data_set, "vald"),
    across(all_of(outcome_col), \(x) factor(x, ind_lvls))
  )
## score every fitted model; one long frame of predicted probabilities
df_eval <- imap(fit_ls, \(x, y) {
  df_to_score |>
    mutate(
      estimate = predict(x, pick(all_of(feat_col)), type = "prob")$.pred_yes,
      ## `y` is a plain character scalar here; embracing it with {{ }} was
      ## unnecessary tidy-eval ceremony
      wflow_id = y
    ) |>
    select(all_of(c(uid_col, outcome_col)), estimate, data_set, wflow_id)
}) |>
  list_rbind()
# df_eval <- df_to_score |>
# mutate(
# estimate = predict(rf_fit, pick(all_of(feat_col)), type = "prob")$.pred_yes,
# ) |>
# select(all_of(c(uid_col, outcome_col, estimate, wflow_id)))
## predicted probability density by outcome class and model
## (the group_by() previously here was a no-op: ggplot2 ignores dplyr groups;
## faceting already splits by SURGERY and wflow_id)
df_eval |>
  ggplot(aes(estimate, fill = SURGERY)) +
  geom_density(alpha = .7) +
  facet_grid(rows = vars(SURGERY), cols = vars(wflow_id), scales = "free_y") +
  scale_x_continuous(breaks = seq(0, .75, .25), labels = label_percent(), trans = "sqrt") +
  scale_fill_viridis_d() +
  theme(
    legend.position = "top", legend.justification = "right",
    legend.margin = margin(t = -13, r = 5, b = -7)
  ) +
  labs(title = "Predicted Probability Density")
## model metric table
df_eval |>
  group_by(data_set, wflow_id) |>
  eval_metrics(truth = all_of(outcome_col), estimate, event_level = event_lvl) |>
  select(!.estimator) |>
  mutate(.estimate = round(.estimate, 3)) |>
  pivot_wider(names_from = data_set, values_from = .estimate) |>
  mutate(
    .metric = str_replace_all(.metric, "_", " "),
    ## positive spread = better on validation than train
    spread = vald - train
  ) |>
  reactable::reactable(
    columns = list(
      .metric = reactable::colDef("Metric", minWidth = 125L),
      wflow_id = reactable::colDef("Model", maxWidth = 75L),
      ## TRUE instead of the reassignable shorthand T
      train = reactable::colDef("Train", format = reactable::colFormat(percent = TRUE, digits = 2L)),
      vald = reactable::colDef("Validation", format = reactable::colFormat(percent = TRUE, digits = 2L)),
      spread = reactable::colDef("Validation - Train", format = reactable::colFormat(percent = TRUE, digits = 2L))
    ),
    groupBy = ".metric",
    highlight = TRUE, #outlined = TRUE
    bordered = TRUE
  )
## roc auc curve
df_eval |>
group_by(data_set, wflow_id) |>
roc_curve(truth = all_of(outcome_col), estimate, event_level = event_lvl) |>
# autoplot()
## drop the +/- Inf threshold endpoints before plotting
filter(!is.infinite(.threshold)) |>
ggplot(aes(1 - specificity, sensitivity, color = wflow_id)) +
geom_line(aes(lty = data_set)) +
geom_abline(slope = 1, intercept = 0, lty = 3L) +
scale_x_continuous(labels = label_percent(), expand = c(0,0)) +
scale_y_continuous(labels = label_percent(), expand = c(0,0)) +
labs(title = "ROC AUC", lty = "Dataset", color = "Model")
## lift curve
df_eval |>
group_by(data_set, wflow_id) |>
lift_curve(truth = all_of(outcome_col), estimate, event_level = event_lvl) |>
# autoplot()
filter(!is.infinite(.lift)) |>
ggplot(aes(.percent_tested, .lift, color = wflow_id)) +
geom_line(aes(lty = data_set)) +
scale_x_continuous(labels = label_percent(scale = 1), expand = c(0,0)) +
scale_y_continuous(labels = label_math(~ .x * x), expand = c(0,0)) +
labs(title = "Lift", x = "% Tested", y = "Lift", lty = "Dataset", color = "Model")
## Warning: Removed 6 rows containing missing values (`geom_line()`).
Choose the threshold based on the J-Index, using a weighted average across the training/validation sets.
## score with logistic model
df_log_eval <- df_to_score |>
mutate(
estimate = predict(log_fit, pick(all_of(feat_col)), type = "prob")$.pred_yes
) |>
select(all_of(c(uid_col, outcome_col)), data_set, estimate)
## feature distributions by predicted probability decile
df_log_eval |>
select(all_of(uid_col), estimate) |>
bind_cols(bake(prep(log_recipe), new_data = df_to_score)) |>
mutate(across(where(is.factor), \(x) case_match(x, "yes" ~ 1L, .default = 0L))) |>
pivot_longer(cols = !c(all_of(c(uid_col, outcome_col)), estimate)) |>
mutate(decile = factor(ntile(estimate, n = 10L), seq_len(10L), ordered = TRUE)) |>
ggplot(aes(decile, value, fill = decile)) +
geom_violin(alpha = .8, show.legend = FALSE) +
facet_wrap(~ name, scales = "free_y")
## performance by predicted probability threshold
## thresholds swept from 0.25% to 10% in 0.25% steps
prob_thresholds <- seq(.0025, .1, .0025)
df_thresh <- df_log_eval |>
group_by(data_set) |>
probably::threshold_perf(
truth = all_of(outcome_col),
estimate = estimate,
thresholds = prob_thresholds,
event_level = event_lvl
)
df_thresh |>
ggplot(aes(.threshold, .estimate, col = data_set)) +
geom_line() +
facet_grid(rows = vars(.metric), scale = "free_y") +
scale_x_continuous(
breaks = c(.005, .01, .025, .05, .1), labels = label_percent(),
trans = "sqrt", expand = c(0,0)
) +
scale_y_continuous(labels = label_percent(), trans = "log1p") +
theme(
legend.position = "top", legend.justification = "right",
legend.margin = margin(t = -13, r = 5, b = -7)
) +
labs(title = "Predicted Probability Threshold Evaluation", col = "Data Set")
## select optimal threshold
## maximize j_index, weighting the validation set twice as heavily as train
event_thresh <- df_thresh |>
filter(.metric == "j_index") |>
mutate(wt = case_match(data_set, "vald" ~ 2L, .default = 1L)) |>
summarise(.estimate = weighted.mean(.estimate, wt), .by = .threshold) |>
slice_max(.estimate)
event_thresh |> glue::glue_data(
"Predicted Probability Threshold for a Positive Event: {percent(.threshold, .01)}"
)
## Predicted Probability Threshold for a Positive Event: 4.25%
## test set predictions
## NOTE(review): the class cut uses event_thresh$.estimate (the weighted
## j_index value), not event_thresh$.threshold (the 4.25% probability cut
## selected above) -- verify which was intended
df_test_base <- process_predict(df_surgeries_test, df_claims_test, dataset = "test") |>
mutate(
PREDICTION = predict(log_fit, pick(all_of(feat_col)), type = "prob")$.pred_yes,
predicted_class = factor(
case_when(PREDICTION > event_thresh$.estimate ~ "yes", .default = "no"),
ind_lvls
)
) |>
select(all_of(uid_col), PREDICTION, predicted_class)
## class prediction summary table
df_test_base |>
summarise(n = n(), .by = predicted_class) |>
mutate(prop = round(n / sum(n), 4))
## # A tibble: 2 × 3
## predicted_class n prop
## <fct> <int> <dbl>
## 1 no 10390 0.990
## 2 yes 101 0.0096
## predicted probability distribution
df_test_base |>
ggplot(aes(PREDICTION)) +
geom_density(fill = "coral2", alpha = .8) +
scale_x_continuous(labels = label_percent(), trans = "sqrt")
## write scored test set to the data directory
df_test_base |>
readr::write_csv(dir_data("Surgeries_Test_Scored.csv"))
## Registered S3 methods overwritten by 'readr':
## method from
## as.data.frame.spec_tbl_df vroom
## as_tibble.spec_tbl_df vroom
## format.col_spec vroom
## print.col_spec vroom
## print.collector vroom
## print.date_names vroom
## print.locale vroom
## str.col_spec vroom
I think the surgery that we are attempting to predict is shoulder surgery, based on the most common Dx codes. If 2nd/3rd guesses count for anything, then I'd go with knee surgery as the next most likely, followed by the dark horse of dental surgery — although I suspect the Dx codes related to dental surgery may have been excluded from the data.